Read the dataset used in this assignment. Pay attention to the extension of the datafile.
data = read_xlsx('C:/Users/Deanne/Desktop/Term 4/SUBJECTS/STATS PROGRAMMING/assignment_3_dataset.xlsx')
If you find values in the dataset during the EDA, that are not correct based on the provided descriptions of the variables of the dataset please correct them here.
data = data %>%
mutate(sex = dplyr::recode(sex, "male" = 1, "female" = 2, "woman" = 2)) %>%
mutate(pain = case_when(pain > 10 ~ 10, .default = pain)) %>%
mutate(mindfulness = case_when(mindfulness > 6 ~ 6, .default = mindfulness)) %>%
mutate(cortisol = round((cortisol_serum + cortisol_saliva) / 2, digits = 2))
data
## # A tibble: 160 × 13
## ID pain sex age STAI_trait pain_cat cortisol_serum cortisol_saliva
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ID_1 5 2 38 39 25 4.67 4.78
## 2 ID_2 4 2 36 46 31 6.01 6.71
## 3 ID_3 5 2 51 49 32 5.18 4.75
## 4 ID_4 7 2 39 48 41 6.65 6.68
## 5 ID_5 4 1 48 36 26 2.95 3.2
## 6 ID_6 6 2 45 37 28 4.32 4.36
## 7 ID_7 4 1 45 34 22 5.32 5.19
## 8 ID_8 5 2 35 39 32 5.26 5.53
## 9 ID_9 4 1 39 31 27 3.36 3.3
## 10 ID_10 5 2 33 38 36 5.96 5.78
## # ℹ 150 more rows
## # ℹ 5 more variables: mindfulness <dbl>, weight <dbl>, IQ <dbl>,
## # household_income <dbl>, cortisol <dbl>
N = nrow(data)
N
## [1] 160
In order to test the more complex model for outliers and to test the assumptions first build the model.
# data filtered to relevant variables only
simple_data = data %>% select(sex, age, pain)
# some plotting
simple_data_plot = plot_ly(data = simple_data, x=~sex, y=~age, z=~pain, type="scatter3d") %>%
layout(showlegend = FALSE)
simple_data_plot
# create a simple linear regression model
simple_model = lm(pain ~ sex + age, data = simple_data)
summary(simple_model)
##
## Call:
## lm(formula = pain ~ sex + age, data = simple_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7245 -1.0069 0.0985 0.9990 5.0985
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.61249 1.05866 8.135 1.17e-13 ***
## sex -0.04122 0.24019 -0.172 0.863972
## age -0.08850 0.02386 -3.710 0.000287 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.516 on 157 degrees of freedom
## Multiple R-squared: 0.0806, Adjusted R-squared: 0.06888
## F-statistic: 6.881 on 2 and 157 DF, p-value: 0.001365
Check for outlier values in the model.
# add Cook's distance
simple_data = simple_data %>% mutate(cooks_d = cooks.distance(simple_model))
# threshold for outlier detection
threshold = 4/N
threshold
## [1] 0.025
# show outliers
simple_data %>% filter(cooks_d >= threshold)
## # A tibble: 7 × 4
## sex age pain cooks_d
## <dbl> <dbl> <dbl> <dbl>
## 1 2 43 1 0.0277
## 2 1 46 8 0.0360
## 3 2 33 3 0.0271
## 4 2 44 8 0.0251
## 5 2 43 1 0.0277
## 6 2 44 1 0.0293
## 7 2 41 10 0.0462
Check the normality assumption.
hist(simple_data$age)
shapiro.test(simple_data$age)
##
## Shapiro-Wilk normality test
##
## data: simple_data$age
## W = 0.99013, p-value = 0.3293
hist(simple_data$sex)
shapiro.test(simple_data$sex)
##
## Shapiro-Wilk normality test
##
## data: simple_data$sex
## W = 0.6355, p-value < 2.2e-16
Check the linearity assumption.
cor(simple_data$age, simple_data$pain)
## [1] -0.2835908
cor(simple_data$sex, simple_data$pain)
## [1] -0.002197505
Check the homoscedasticty assumption (homogeneity of variance).
ncvTest(simple_model)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 2.513787, Df = 1, p = 0.11285
Multicollinearity assumption
vif(simple_model)
## sex age
## 1.001486 1.001486
If based on the assumption tests you decide to drop a predictor variable you should do that here. Create your updated model.
# data filtered to relevant variables only
complex_data = data %>% select(sex, age, STAI_trait, pain_cat, mindfulness, cortisol, pain)
# create a simple linear regression model
complex_model = lm(pain ~ sex + age + STAI_trait + pain_cat + mindfulness + cortisol, data = complex_data)
summary(complex_model)
##
## Call:
## lm(formula = pain ~ sex + age + STAI_trait + pain_cat + mindfulness +
## cortisol, data = complex_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2112 -0.8343 -0.0343 0.7959 4.6105
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.69609 1.79805 0.943 0.34702
## sex -0.27704 0.21870 -1.267 0.20715
## age -0.02492 0.02459 -1.013 0.31253
## STAI_trait -0.01974 0.02836 -0.696 0.48745
## pain_cat 0.09033 0.02916 3.097 0.00232 **
## mindfulness -0.11075 0.12979 -0.853 0.39482
## cortisol 0.63331 0.13387 4.731 5.05e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.292 on 153 degrees of freedom
## Multiple R-squared: 0.3497, Adjusted R-squared: 0.3242
## F-statistic: 13.71 on 6 and 153 DF, p-value: 1.977e-12
complex_data = complex_data %>% mutate(cooks_d = cooks.distance(complex_model))
complex_data %>% filter(cooks_d >= threshold)
## # A tibble: 9 × 8
## sex age STAI_trait pain_cat mindfulness cortisol pain cooks_d
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 44 47 28 4.01 5.43 2 0.0259
## 2 1 46 37 24 3.1 6.58 8 0.0569
## 3 2 38 38 21 3.67 5.18 7 0.0338
## 4 2 43 50 26 3.96 3.99 1 0.0623
## 5 1 37 41 31 1 6.6 4 0.0551
## 6 1 39 36 39 4.34 3.79 7 0.0310
## 7 2 47 41 24 1.28 4.6 6 0.0275
## 8 2 41 44 30 3.3 5.99 10 0.0383
## 9 1 48 48 30 2.55 5.54 2 0.0470
Normality assumption
hist(complex_data$STAI_trait)
shapiro.test(complex_data$STAI_trait)
##
## Shapiro-Wilk normality test
##
## data: complex_data$STAI_trait
## W = 0.98838, p-value = 0.2086
hist(complex_data$pain_cat)
shapiro.test(complex_data$pain_cat)
##
## Shapiro-Wilk normality test
##
## data: complex_data$pain_cat
## W = 0.97803, p-value = 0.01189
hist(complex_data$mindfulness)
shapiro.test(complex_data$mindfulness)
##
## Shapiro-Wilk normality test
##
## data: complex_data$mindfulness
## W = 0.99426, p-value = 0.7862
hist(complex_data$cortisol)
shapiro.test(complex_data$cortisol)
##
## Shapiro-Wilk normality test
##
## data: complex_data$cortisol
## W = 0.9844, p-value = 0.0692
Linearity assumption
cor(complex_data$STAI_trait, complex_data$pain)
## [1] 0.2532565
cor(complex_data$pain_cat, complex_data$pain)
## [1] 0.4581928
cor(complex_data$mindfulness, complex_data$pain)
## [1] -0.2964476
cor(complex_data$cortisol, complex_data$pain)
## [1] 0.4998132
Homoscedasticty assumption (homogeneity of variance)
ncvTest(complex_model)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.1507902, Df = 1, p = 0.69778
Multicollinearity assumption
vif(complex_model)
## sex age STAI_trait pain_cat mindfulness cortisol
## 1.143878 1.466234 2.033713 1.850074 1.519735 1.633069
Compare the two models.
anova(simple_model, complex_model)
## Analysis of Variance Table
##
## Model 1: pain ~ sex + age
## Model 2: pain ~ sex + age + STAI_trait + pain_cat + mindfulness + cortisol
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 157 360.86
## 2 153 255.25 4 105.61 15.826 7.337e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1